Exercise 1

Here I will create a new SQLite database in my database folder, and then check if it exists.

#Load the packages.
library(DBI)
library(RSQLite)

# Create a new SQLite database inside a "database" folder.
# Guard dir.create() so a rerun does not warn when the folder already exists.
if (!dir.exists("database")) {
  dir.create("database")
}
new_folder <- file.path(getwd(), "database")
db_path <- file.path(new_folder, "my_database.sqlite")
# Connecting creates the database file on disk if it does not exist yet.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = db_path)
my_db <- "my_database.sqlite"

# Check for the existence of my relational database.
# NOTE(review): setwd() changes the working directory for every later relative
# path in this document; it is kept for compatibility, but the existence check
# now uses the absolute db_path so it no longer depends on the cwd change.
setwd("database")
if (file.exists(db_path)) {
  cat("The database file exists!\n")
} else {
  cat("The database file does not exist.\n")
}
## The database file exists!

Exercise 2.a Gathering structured data

Here I will write an automatic function to extract a table of all R1 (Very High Research Activity) and R2 (High Research Activity) Research Institutions in the US from a Wikipedia page (https://en.wikipedia.org/wiki/List_of_research_universities_in_the_United_States). The table will include the names, control status, location and Wikipedia page URL of those universities.

## # A tibble: 6 × 5
##   Institution              Control              City        State Wikipedia_Link
##   <chr>                    <chr>                <chr>       <chr> <chr>         
## 1 Arizona State University Public               Tempe       AZ    https://en.wi…
## 2 Auburn University        Public               Auburn      AL    https://en.wi…
## 3 Baylor University        Private (non-profit) Waco        TX    https://en.wi…
## 4 Binghamton University    Public               Vestal      NY    https://en.wi…
## 5 Boston College           Private (non-profit) Chestnut H… MA    https://en.wi…
## 6 Boston University        Private (non-profit) Boston      MA    https://en.wi…

Exercise 2.b Gathering unstructured data

Here, I will write a new scraping function to gather the geographic coordinates, the endowment (USD) and the total number of students of each institution.

##                Institution              Control          City State
## 1 Arizona State University               Public         Tempe    AZ
## 2        Auburn University               Public        Auburn    AL
## 3        Baylor University Private (non-profit)          Waco    TX
## 4    Binghamton University               Public        Vestal    NY
## 5           Boston College Private (non-profit) Chestnut Hill    MA
## 6        Boston University Private (non-profit)        Boston    MA
##                                           Wikipedia_Link           Coordinates
## 1 https://en.wikipedia.org/wiki/Arizona_State_University  33.4209°N 111.9340°W
## 2        https://en.wikipedia.org/wiki/Auburn_University     32.603°N 85.486°W
## 3        https://en.wikipedia.org/wiki/Baylor_University     31.548°N 97.116°W
## 4    https://en.wikipedia.org/wiki/Binghamton_University   42.0888°N 75.9670°W
## 5           https://en.wikipedia.org/wiki/Boston_College 42.33500°N 71.17028°W
## 6        https://en.wikipedia.org/wiki/Boston_University 42.34889°N 71.10028°W
##   Endowment Total_Students
## 1  1.47e+09         142636
## 2  1.05e+09          33015
## 3  1.97e+09          20626
## 4 148100000          18148
## 5   3.3e+09          14890
## 6   3.2e+09          36729

Exercise 2.c Data munging

In this part, I will add 3 new columns to my previous table based on the file “ivyleague.csv”. The new table will indicate every institution’s Ivy League membership, county and state, and the EIN number for Ivy League members.

##                Institution              Control          City State
## 1 Arizona State University               Public         Tempe    AZ
## 2        Auburn University               Public        Auburn    AL
## 3        Baylor University Private (non-profit)          Waco    TX
## 4    Binghamton University               Public        Vestal    NY
## 5           Boston College Private (non-profit) Chestnut Hill    MA
## 6        Boston University Private (non-profit)        Boston    MA
##                                           Wikipedia_Link           Coordinates
## 1 https://en.wikipedia.org/wiki/Arizona_State_University  33.4209°N 111.9340°W
## 2        https://en.wikipedia.org/wiki/Auburn_University     32.603°N 85.486°W
## 3        https://en.wikipedia.org/wiki/Baylor_University     31.548°N 97.116°W
## 4    https://en.wikipedia.org/wiki/Binghamton_University   42.0888°N 75.9670°W
## 5           https://en.wikipedia.org/wiki/Boston_College 42.33500°N 71.17028°W
## 6        https://en.wikipedia.org/wiki/Boston_University 42.34889°N 71.10028°W
##   Endowment Total_Students IvyLeague_Member County EIN
## 1  1.47e+09         142636               No   <NA>  NA
## 2  1.05e+09          33015               No   <NA>  NA
## 3  1.97e+09          20626               No   <NA>  NA
## 4 148100000          18148               No   <NA>  NA
## 5   3.3e+09          14890               No   <NA>  NA
## 6   3.2e+09          36729               No   <NA>  NA

Exercise 2.d Writing to the relational database

Here I will write the final table generated before into my relational database, and define a function “checkTable()” to check for the existence and correct dimensionality of my written tables.

The definition of the function and the output can be found as follows.

# Create a checking function.
# checkTable() reports whether `table_name` exists in the connected database
# and, if so, prints its row count, column count and column names.
# NOTE(review): the function uses (and closes) the global connection `con`
# rather than opening one from `database_name`; the argument is kept for
# interface compatibility, and callers re-create `con` before each call.
checkTable <- function(database_name, table_name) {
  # Check if the table exists
  if (dbExistsTable(con, table_name)) {
    # Column metadata: PRAGMA table_info returns one row per column.
    table_info <- dbGetQuery(con, paste0("PRAGMA table_info(", table_name, ")"))

    # Number of data rows in the table.
    num_rows <- dbGetQuery(con, paste0("SELECT COUNT(*) AS num_rows FROM ", table_name))$num_rows

    # Get column names
    column_names <- table_info$name

    # Output information. The column count is the length of the PRAGMA
    # result; the original query reported the row count twice (bug).
    cat("Number of Rows:", num_rows, "\n")
    cat("Number of Columns:", length(column_names), "\n")
    cat("Column Names:", column_names, "\n")
  } else {
    cat("The table does not exist.")
  }

  # Close the database connection
  dbDisconnect(con)
}
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
checkTable("my_database.sqlite", "Exercise2_Universities_Data")
## Number of Rows: 279 
## Number of Columns: 279 
## Column Names: Institution Control City State Wikipedia_Link Coordinates Endowment Total_Students IvyLeague_Member County EIN

Exercise 3

In this exercise, I will use the RSelenium package to fetch the ranking data from the Academic Ranking of World Universities (https://www.shanghairanking.com/).

Exercise 3.a Scraping annual rank

Here, I will create a webscraper to gather the ARWU ranking for the Ivy League institutions across the years 2003, 2013 and 2023, and convert the ranking data to numeric values.

##                  Institution_Year  Rank
## 1 University of Pennsylvania_2023    14
## 2           Brown University_2023    98
## 3        Columbia University_2023     8
## 4         Cornell University_2023    12
## 5          Dartmouth College_2023 350.5
## 6         Harvard University_2023     1

Call the function checkTable() to check for existence and dimensionality of the final table.

# Call checkTable() (defined in 2d) to verify the ARWU table exists with the
# expected dimensions. checkTable() closes the connection it uses, so `con`
# is re-opened before the call.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
checkTable("my_database.sqlite", "ARWU_ranks")
## Number of Rows: 24 
## Number of Columns: 24 
## Column Names: Institution_Year Rank

Exercise 3.b Scraping subject ranks for 2023

Here, I will scrape all social science subjects and their ranks for each Ivy League university in 2023.

## # A tibble: 6 × 2
##   Institution_Subject                  Rank 
##   <chr>                                <chr>
## 1 University of Pennsylvania_Economics 14   
## 2 Brown University_Economics           46   
## 3 Columbia University_Economics        7    
## 4 Cornell University_Economics         22   
## 5 Dartmouth College_Economics          63   
## 6 Harvard University_Economics         2

Call the function checkTable() to check for existence and dimensionality of the final table.

# Call checkTable() (defined in 2d) to verify the subject-rank table exists
# with the expected dimensions. checkTable() closes the connection it uses,
# so `con` is re-opened before the call.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
checkTable("my_database.sqlite", "Subject_ranks")
## Number of Rows: 112 
## Number of Columns: 112 
## Column Names: Institution_Subject Rank

Exercise 4

Here, I will use the package “tidycensus” to access a raw API and gather financial data (2011 - 2021) as well as local economic data (2015 and 2020) for Ivy League members.

Exercise 4.a Gathering financial data from a raw API

In the first table, I retrieved the Total Revenue and Total Assets of each Ivy League institution for the years 2011 - 2021.

##                  Institution_Year Total_Revenue Total_Assets
## 1 University of Pennsylvania_2021    9337742000  31220689000
## 2 University of Pennsylvania_2020    7624539000  24607104000
## 3 University of Pennsylvania_2019    7942744000  22754777000
## 4 University of Pennsylvania_2018    7238854000  21301691000
## 5 University of Pennsylvania_2017    6710820000  19054006000
## 6 University of Pennsylvania_2016    6286195000  17580872000

Call the function created in 2d to check for the existence as well as dimensionality of the final table. The output is as follows.

# Check for existence and dimensionality of the financial-data table.
# checkTable() closes the connection it uses, so `con` is re-opened first.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
checkTable("my_database.sqlite", "Financial_data")
## Number of Rows: 82 
## Number of Columns: 82 
## Column Names: Institution_Year Total_Revenue Total_Assets

Exercise 4.b Gathering local economic data from a packaged API

Based on the data collected before and the tidycensus package, I will here first retrieve all counties in the US and their estimated median household income for both 2015 and 2020. The output tables can be found as follows.

## # A tibble: 3,220 × 5
##    GEOID NAME                     variable   estimate   moe
##    <chr> <chr>                    <chr>         <dbl> <dbl>
##  1 01001 Autauga County, Alabama  B19013_001    51281  2391
##  2 01003 Baldwin County, Alabama  B19013_001    50254  1263
##  3 01005 Barbour County, Alabama  B19013_001    32964  2973
##  4 01007 Bibb County, Alabama     B19013_001    38678  3995
##  5 01009 Blount County, Alabama   B19013_001    45813  3141
##  6 01011 Bullock County, Alabama  B19013_001    31938  5884
##  7 01013 Butler County, Alabama   B19013_001    32229  1793
##  8 01015 Calhoun County, Alabama  B19013_001    41703   925
##  9 01017 Chambers County, Alabama B19013_001    34177  2949
## 10 01019 Cherokee County, Alabama B19013_001    36296  1710
## # ℹ 3,210 more rows
## # A tibble: 3,221 × 5
##    GEOID NAME                     variable   estimate   moe
##    <chr> <chr>                    <chr>         <dbl> <dbl>
##  1 01001 Autauga County, Alabama  B19013_001    57982  4839
##  2 01003 Baldwin County, Alabama  B19013_001    61756  2268
##  3 01005 Barbour County, Alabama  B19013_001    34990  2909
##  4 01007 Bibb County, Alabama     B19013_001    51721  6237
##  5 01009 Blount County, Alabama   B19013_001    48922  2269
##  6 01011 Bullock County, Alabama  B19013_001    33866 10094
##  7 01013 Butler County, Alabama   B19013_001    44850  2891
##  8 01015 Calhoun County, Alabama  B19013_001    50128  1964
##  9 01017 Chambers County, Alabama B19013_001    43875  3861
## 10 01019 Cherokee County, Alabama B19013_001    42509  3824
## # ℹ 3,211 more rows

Then, I will match the local economic data with the Ivy League data collected earlier, creating a new Ivy League universities table to show the counties where each institution is located and the local estimated median household income data for 2015 and 2020.

##                  Institution_Year                            County
## 1 University of Pennsylvania_2015 Philadelphia County, Pennsylvania
## 2 University of Pennsylvania_2020 Philadelphia County, Pennsylvania
## 3           Brown University_2015   Providence County, Rhode Island
## 4           Brown University_2020   Providence County, Rhode Island
## 5        Columbia University_2015         New York County, New York
## 6        Columbia University_2020         New York County, New York
##   Estimated_median_household_income
## 1                             38253
## 2                             49127
## 3                             49743
## 4                             62323
## 5                             72871
## 6                             89812

After I have collected the data I need and written it into my relational database, I will then use the checkTable function defined in 2d to check the existence and correct dimensionality of the final table. The outcome is as follows.

# Check for the existence and correct dimensionality of the joined table.
# checkTable() closes the connection it uses, so `con` is re-opened first.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
checkTable("my_database.sqlite", "Ivy_League_with_Economic_Data")
## Number of Rows: 16 
## Number of Columns: 16 
## Column Names: Institution_Year County Estimated_median_household_income

Exercise 5.a Analysis and visualisation

In this exercise, I will first collect a table indicating the Ivy League institutions’ rank and financial data. Then I will use 4 plots to interpret the relationships between these variables.

The following table illustrates: 1. Institution name 2. The average rank of the university across 2003, 2013, and 2023 3. The average rank of the university’s Economics, Political Science, and Sociology programs, if they were ranked 4. The current endowment per student (total endowment divided by total number of students), in USD 5. The average total revenue per student across the years 2015 - 2020, in USD 6. The average of the median household income for the County across the years 2015 and 2020, in USD

##            Institution Avg_ranking Economics_Rank Political_Science_Rank
## 1     Brown University          71             46                     63
## 2  Columbia University           9              7                      8
## 3   Cornell University          12             22                     32
## 4    Dartmouth College         218             63                     45
## 5   Harvard University           1              2                      2
## 6 Princeton University           7              6                      3
##   Sociology_Rank Endowment_per_Student Avg_Revenue_Per_Student
## 1             32              633093.5                125638.6
## 2             20              382381.7                156524.7
## 3             11              380459.6                181180.6
## 4             NA             1286319.6                209856.2
## 5              1             2345810.4                281756.9
## 6             14             4222694.0                328221.7
##   Avg_median_household_income
## 1                       56033
## 2                       81342
## 3                       56993
## 4                       61619
## 5                       95660
## 6                       78055

Exercise 5a Plot 1

In Figure 1, the line graph shows the ranks for 3 different social subjects in terms of Economics, Political Science and Sociology, and the overall average rank of every Ivy League university in 2023. Overall, the subject rankings are roughly in line with the overall average rankings. But it is interesting to find that while Dartmouth College was ranked the lowest among all the Ivy League institutions with a number of over 200, it demonstrates a remarkable academic strength in Economics and Political Science.

Exercise 5a Plot 2

In Figure 2, we can observe the average rank and endowment per student of each Ivy League university. In terms of the ranking data, Dartmouth College shows a great distance from other Ivy League members, with a figure of 218, compared to other universities that are ranked under 100. Notably, Columbia University and Cornell University have obtained prominent ranks of 9 and 12 respectively with the lowest endowment per student among all the 8 institutions, while other members such as Princeton University have acquired a ranking of 7 with over seven times the per-student endowment of Columbia and Cornell.

Exercise 5a Plot 3

In Figure 3, the line chart illustrates the average endowment per student of Ivy League institutions, and the average median household income of where these institutions are located. It is obvious that the local income is considerably less than the endowment of the universities. Particularly noteworthy is the fact that the endowment varies dramatically among the eight institutions as well, with Columbia University and Cornell University showing the lowest point and Princeton University the highest on the scale. That is a notable difference of over 7 times.

Exercise 5a Plot 4

In Figure 4, the line graph reflects the average median household income of where the Ivy League institutions are located, as well as the average revenue per student of each university. Overall, the average revenue per student of every institution is much higher than the local average median household income, with Yale University being the most notable. In contrast to Yale, Brown University demonstrates the minimal income disparity, with the lowest average per-student revenue of $125,639 among all the 8 institutions.

Exercise 5.b Visualisation of geographic data

In this exercise, I will first call into R from my relational database a table that includes data needed for visualisation.

Since the follow-up visualisation work needs to use separate numerical types of latitude and longitude coordinates, I will use SQL (via RSQLite) to convert the string-type coordinates collected earlier for further processing. The final obtained table is as follows. The table contains the information of the institution’s name, coordinate values, status, and Ivy League membership.

##                Institution Latitude  Longitude               Status
## 1 Arizona State University 33.42090 -111.93400               Public
## 2        Auburn University 32.60300  -85.48600               Public
## 3        Baylor University 31.54800  -97.11600 Private (non-profit)
## 4    Binghamton University 42.08880  -75.96700               Public
## 5           Boston College 42.33500  -71.17028 Private (non-profit)
## 6        Boston University 42.34889  -71.10028 Private (non-profit)
##   IvyLeague_Member
## 1               No
## 2               No
## 3               No
## 4               No
## 5               No
## 6               No

Next, I will use the data to generate an interactive map of research institutions in the United States.

As is evident from Figure 5, Ivy League schools are concentrated in the northeastern part of the mainland U.S., with around 75 percent of them located along the coast. Overall, the R1 and R2 institutions are more densely located in the East than in the West of the mainland US, and are more concentrated along the coast than inland, comprising a much larger share of public institutions than private ones.

It is also notable that in terms of the allocation of research resources, northwestern states such as Wyoming, Nevada, Montana, and South Dakota are particularly deprived.

To understand the geographic distribution patterns of research institutions I have observed, multiple perspectives and factors could be taken into consideration. Initially, in my point of view, the economic and industrial factor is of prime importance. As European colonists first landed from the Atlantic coast of the northeastern United States, the northeastern industrial region became the earliest industrial area and the first growth area in the nation. The region was initially dominated by traditional industries such as coal, steel, machinery, automobiles, and chemicals, leading to the accumulation of capital and resources during the early period of history. The economic and industrial development thereby contributed to higher concentrations of educational resources in these areas due to collaboration opportunities, funding availability, and industry partnerships. Secondly, historical and cultural factors are indispensable. Many research institutions (especially the Ivy League members) have deep cultural roots shaped by history. Combined with the colonial history of the mainland US, we may assume that the early settlement of colonisers and the presence of wealthy families might have influenced the establishment of research institutions. Additionally, governmental policies, funding initiatives, and investments in education, research, and technology might have favored certain regions, leading to disparities in research resource allocation. To further investigate this case, more detailed data such as demographic statistics and policy information would need to be involved.

Appendix: All code in this assignment

knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(echo = FALSE)

#Load the packages.
library(DBI)
library(RSQLite)

# Create a new SQLite database inside a "database" folder.
# Guard dir.create() so a rerun does not warn when the folder already exists.
if (!dir.exists("database")) {
  dir.create("database")
}
new_folder <- file.path(getwd(), "database")
db_path <- file.path(new_folder, "my_database.sqlite")
# Connecting creates the database file on disk if it does not exist yet.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = db_path)
my_db <- "my_database.sqlite"

# Check for the existence of my relational database.
# NOTE(review): setwd() changes the working directory for every later relative
# path in this document; it is kept for compatibility, but the existence check
# now uses the absolute db_path so it no longer depends on the cwd change.
setwd("database")
if (file.exists(db_path)) {
  cat("The database file exists!\n")
} else {
  cat("The database file does not exist.\n")
}


# Load the packages.
library(RSelenium)  # Selenium WebDriver bindings used for browser automation
library(tidyverse)
library(rvest)      # HTML parsing/scraping helpers
library(xml2)
library(netstat)

# Set up the driver and navigate to the target url.
# NOTE(review): rsDriver() launches a live Firefox session on port 2345 and
# requires a local Selenium/geckodriver installation; `driver` is the client
# handle used by all scraping code below.
rD <- rsDriver(browser=c("firefox"), verbose = F, port = 2345L, chromever = NULL) 
driver <- rD[["client"]]
url <- 'https://en.wikipedia.org/wiki/List_of_research_universities_in_the_United_States'
driver$navigate(url)

# Write a function to get tables and urls automatically.
# table_scrape() reads the page currently loaded in the global Selenium
# `driver`, extracts the first two tables on the Wikipedia page (the R1 and
# R2 institution lists), attaches each institution's Wikipedia URL as a new
# column, and returns the two tables stacked into one data frame.
table_scrape <- function() {
  # Get source code of the page as rendered by the browser.
  src <- driver$getPageSource()

  # Parse all <table> elements on the page.
  result_html <- read_html(src[[1]]) %>%
    html_elements("table")

  # Convert every table to a data frame.
  result_table <- result_html %>%
    html_table()

  # Preallocate the result list (the original grew it implicitly).
  extracted_table <- vector("list", 2)

  # For the first two tables (R1 and R2), pull each institution's link from
  # the first column and bind it on as a Wikipedia_Link column.
  # Consistency fix: use html_elements() throughout instead of mixing in the
  # superseded html_nodes().
  for (i in 1:2) {
    links <- result_html[[i]] %>%
      html_elements("td:nth-child(1) a") %>%
      html_attr("href")
    links <- paste0('https://en.wikipedia.org', links)
    links_df <- data.frame(Wikipedia_Link = links)
    extracted_table[[i]] <- bind_cols(result_table[[i]], links_df)
  }

  # Stack R1 and R2 into a single final table.
  bind_rows(extracted_table)
}

# Scrape the data.
structured_tables <- table_scrape()

# View the final table.
head(structured_tables)


# First, define a function for converting endowment string-type content
# (e.g. "$1.5 billion", "148.1 million") to a numeric value in USD.
# Returns NA_real_ — without a coercion warning — when the input is NA or
# contains no digits (e.g. "Not Available").
convert_to_number <- function(str) {
  # Bail out cleanly on NA or digit-free strings; the original produced a
  # "NAs introduced by coercion" warning here.
  digits <- gsub("[^0-9.]", "", str)
  if (is.na(str) || !nzchar(digits)) {
    return(NA_real_)
  }
  # Pick the scale from the unit word; default is plain dollars.
  multiplier <- if (grepl("billion", str, ignore.case = TRUE)) {
    1e9
  } else if (grepl("million", str, ignore.case = TRUE)) {
    1e6
  } else {
    1
  }
  as.numeric(digits) * multiplier
}

# Create a function to gather unstructured data from each institution's
# Wikipedia page: decimal coordinates, endowment (USD, numeric) and the total
# number of students. Expects `structured_data` to have a Wikipedia_Link
# column; returns the same data frame with three columns appended.
extractUnstructuredData <- function(structured_data) {
  n <- nrow(structured_data)
  # Preallocate one slot per institution.
  coordinates <- character(n)
  endowment <- numeric(n)
  total_students <- numeric(n)

  # Helper: extract the first "d,ddd"-style count matched by a CSS selector;
  # NA_real_ when the row is absent from the infobox.
  extract_count <- function(page, selector) {
    text <- page %>%
      html_nodes(selector) %>%
      html_text()
    value <- gsub(",", "", str_extract(text, "(\\d{1,3},\\d{3})"))
    if (length(value) > 0) as.numeric(value[1]) else NA_real_
  }

  for (i in seq_len(n)) {
    # Read the HTML content of the university's page.
    university_page <- read_html(structured_data$Wikipedia_Link[i])

    # Extracting coordinates (first span.geo-dec on the page; NA if absent).
    coordinates[i] <- university_page %>%
      html_nodes("span.geo-dec") %>%
      html_text() %>%
      first()

    # Extracting endowment. Keeping the intermediate text in a local
    # character variable fixes the original bug where assigning a string
    # into the numeric `endowment` vector silently coerced the whole vector
    # (and hence the final Endowment column) to character.
    endowment_node <- university_page %>%
      html_nodes(".infobox tbody tr:contains('Endowment') td:nth-child(2)") %>%
      html_text()
    endowment_str <- if (length(endowment_node) > 0) endowment_node[1] else "Not Available"
    # Delete footnote markers like "[12]" and parenthesised alternates.
    endowment_str <- gsub("\\[\\d+\\]", "", endowment_str)
    endowment_str <- gsub("\\(.*\\)", "", endowment_str)
    endowment[i] <- convert_to_number(endowment_str)

    ## Extracting total number of students.
    undergrad <- extract_count(university_page, ".infobox tbody tr:contains('Undergraduates') td:nth-child(2)")
    postgrad <- extract_count(university_page, ".infobox tbody tr:contains('Postgraduates') td:nth-child(2)")
    # Combined "Students" infobox row, for institutions without a split.
    students <- extract_count(university_page, ".infobox-label:contains('Students') + .infobox-data")

    if (!is.na(undergrad) || !is.na(postgrad)) {
      # Sum whatever split counts exist (missing one counts as 0) and keep
      # the larger of that sum and the combined "Students" figure — same
      # rule as the original, but scoped to this row instead of mutating
      # every NA slot in the preallocated vectors.
      combined <- sum(undergrad, postgrad, na.rm = TRUE)
      total_students[i] <- max(combined, students, na.rm = TRUE)
    } else {
      # No split counts at all: fall back to the combined figure (may be NA).
      total_students[i] <- students
    }
  }

  # Adding the extracted data to the structured data.
  structured_data$Coordinates <- coordinates
  structured_data$Endowment <- endowment
  structured_data$Total_Students <- total_students

  # Return function outcome.
  structured_data
}

# Extract unstructured data.
complete_data <- extractUnstructuredData(structured_tables)
# View the final table.
head(complete_data)

# Read the csv file of ivy league members.
ivydata <- read.csv("ivyleague.csv")
# Rename the institutions to match the Wikipedia table's naming.
ivydata$uni_name <- c("University of Pennsylvania", "Brown University", "Columbia University", "Cornell University", "Dartmouth College", "Harvard University", "Princeton University", "Yale University")

# Write this table into my database for further analysis.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
dbWriteTable(con, "Ivy_League", ivydata, row.names = FALSE, overwrite = TRUE)

# Convert the county and state to only one "County, State" column.
ivydata$County <- paste(ivydata$county, ivydata$state, sep = ", ")

# Add membership, county and EIN columns with a vectorized lookup instead of
# the original row-by-row loop: match() returns NA for non-members, so
# indexing with it yields NA County/EIN for non-Ivy institutions, exactly as
# the loop did.
idx <- match(complete_data$Institution, ivydata$uni_name)
complete_data$IvyLeague_Member <- ifelse(is.na(idx), "No", "Yes")
complete_data$County <- ivydata$County[idx]
complete_data$EIN <- ivydata$ein[idx]

# View the final table.
head(complete_data)


# Create your database connection.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)

# Write the tidy table to your database.
dbWriteTable(con, "Exercise2_Universities_Data", complete_data, row.names = FALSE, overwrite = TRUE)

# Create a primary key. IF NOT EXISTS keeps a rerun from erroring when the
# index is already present.
dbExecute(con, "CREATE UNIQUE INDEX IF NOT EXISTS pk_Institution ON Exercise2_Universities_Data(Institution)")

# Create a checking function.
# checkTable() reports whether `table_name` exists in the connected database
# and, if so, prints its row count, column count and column names.
# NOTE(review): the function uses (and closes) the global connection `con`
# rather than opening one from `database_name`; the argument is kept for
# interface compatibility, and callers re-create `con` before each call.
checkTable <- function(database_name, table_name) {
  # Check if the table exists
  if (dbExistsTable(con, table_name)) {
    # Column metadata: PRAGMA table_info returns one row per column.
    table_info <- dbGetQuery(con, paste0("PRAGMA table_info(", table_name, ")"))

    # Number of data rows in the table.
    num_rows <- dbGetQuery(con, paste0("SELECT COUNT(*) AS num_rows FROM ", table_name))$num_rows

    # Get column names
    column_names <- table_info$name

    # Output information. The column count is the length of the PRAGMA
    # result; the original query reported the row count twice (bug).
    cat("Number of Rows:", num_rows, "\n")
    cat("Number of Columns:", length(column_names), "\n")
    cat("Column Names:", column_names, "\n")
  } else {
    cat("The table does not exist.")
  }

  # Close the database connection
  dbDisconnect(con)
}
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
checkTable("my_database.sqlite", "Exercise2_Universities_Data")


# Navigate to the ARWU website.
url3 <- 'https://www.shanghairanking.com/'
driver$navigate(url3)

# Find the explore button and click it.
Explore_button <- driver$findElement(using = "xpath", value = "/html/body/div/div/div/div[2]/div[2]/div[2]/div[1]/button")
Explore_button$clickElement()

# Create a blank dataframe for storage.
ARWU_Rank <- data.frame(matrix(nrow = 24, ncol = 2))
colnames(ARWU_Rank) <- c("Institution_Year", "Rank")

# Wait for the search field to be visible.
Sys.sleep(5)
# Find the search field.
Search_field <- driver$findElement(using = 'css selector', value = ".search-input")

## Scrape the 2023 ranking data.
for (i in 1:nrow(ivydata)) {
  Search_field$sendKeysToElement(list(ivydata$uni_name[i]))
  Search_field$sendKeysToElement(list(key = "enter"))
  rank_2023 <- driver$findElement(using = 'class name', value = 'ranking')$getElementText()
  ARWU_Rank$Institution_Year[i] <- paste0(ivydata$uni_name[i], "_2023")
  ARWU_Rank$Rank[i] <- rank_2023[[1]]
  Search_field$clickElement()
  Search_field$sendKeysToElement(list(key = "control", "A"))
  Search_field$sendKeysToElement(list(key = "delete"))
}

## Scrape the 2013 ranking data.
# Switch to the 2013 ranking page via the year dropdown.
year_select_button <- driver$findElement(using = "xpath", value = "/html/body/div/div/div/div[2]/div/div[1]/div[2]/div[2]/div/div[1]/div")
year_select_button$clickElement()
button2013 <- driver$findElement(using = "css selector", value = "div.rank-select:nth-child(2) > div:nth-child(2) > ul:nth-child(1) > li:nth-child(11)")
button2013$clickElement()

# Re-locate the search field: the page re-renders after switching years,
# so the previous element reference would be stale.
Search_field <- driver$findElement(using = "xpath", value = "/html/body/div/div/div/div[2]/div/div[2]/div/div[1]/div/div[1]/input")

# 2013 rows are stored directly after the 2023 block; derive the row
# offset from the data instead of hard-coding 8.
offset_2013 <- nrow(ivydata)
for (i in seq_len(nrow(ivydata))) {
  # Type the university name and submit the search.
  Search_field$sendKeysToElement(list(ivydata$uni_name[i]))
  Search_field$sendKeysToElement(list(key = "enter"))
  rank_2013 <- driver$findElement(using = 'class name', value = 'ranking')$getElementText()
  ARWU_Rank$Institution_Year[i + offset_2013] <- paste0(ivydata$uni_name[i], "_2013")
  ARWU_Rank$Rank[i + offset_2013] <- rank_2013[[1]]
  # Clear the search box for the next query.
  Search_field$clickElement()
  Search_field$sendKeysToElement(list(key = "control", "A"))
  Search_field$sendKeysToElement(list(key = "delete"))
}

## Scrape the 2003 ranking data.
# Switch to the 2003 ranking page via the year dropdown.
year_select_button <- driver$findElement(using = "xpath", value = "/html/body/div/div/div/div[2]/div/div[1]/div[2]/div[2]/div/div[1]/div")
year_select_button$clickElement()
button2003 <- driver$findElement(using = "css selector", value = "div.rank-select:nth-child(2) > div:nth-child(2) > ul:nth-child(1) > li:nth-child(21)")
button2003$clickElement()

# Re-locate the search field (the page re-renders after the year switch).
Search_field <- driver$findElement(using = "xpath", value = "/html/body/div/div/div/div[2]/div/div[2]/div/div[1]/div/div[1]/input")

# 2003 rows follow the 2023 and 2013 blocks; derive the offset from the
# data instead of hard-coding 16.
offset_2003 <- 2 * nrow(ivydata)
for (i in seq_len(nrow(ivydata))) {
  # Type the university name and submit the search.
  Search_field$sendKeysToElement(list(ivydata$uni_name[i]))
  Search_field$sendKeysToElement(list(key = "enter"))
  rank_2003 <- driver$findElement(using = 'class name', value = 'ranking')$getElementText()
  ARWU_Rank$Institution_Year[i + offset_2003] <- paste0(ivydata$uni_name[i], "_2003")
  ARWU_Rank$Rank[i + offset_2003] <- rank_2003[[1]]
  # Clear the search box for the next query.
  Search_field$clickElement()
  Search_field$sendKeysToElement(list(key = "control", "A"))
  Search_field$sendKeysToElement(list(key = "delete"))
}

# Convert the scraped rank strings to numbers. Ranked ranges such as
# "101-150" collapse to the mean of their endpoints. The original loop
# assigned each numeric mean back into the existing character column,
# which silently coerced the result straight back to character; replacing
# the whole column at once keeps it genuinely numeric. Base regmatches()/
# gregexpr() stand in for stringr::str_extract_all().
ARWU_Rank$Rank <- vapply(
  ARWU_Rank$Rank,
  function(r) {
    if (is.na(r)) return(NA_real_)  # preserve missing ranks
    mean(as.numeric(regmatches(r, gregexpr("\\d+", r))[[1]]))
  },
  numeric(1),
  USE.NAMES = FALSE
)

# View the output table.
head(ARWU_Rank)

# Write the ARWU table into my relational database.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
dbWriteTable(con, "ARWU_ranks", ARWU_Rank, row.names = FALSE, overwrite = TRUE)
# Re-connect and check table existence and dimensionality.
# NOTE(review): a second connection is opened without disconnecting the
# first; checkTable() closes the connection it uses, but the earlier
# handle is left to the garbage collector - consider dbDisconnect().
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
checkTable("my_database.sqlite", "ARWU_ranks")
# Navigate back to the ShanghaiRanking main page.
driver$navigate(url3)

# Find the subject explore button and click.
Subject_Explore_Button <- driver$findElement(using = "xpath", value = "/html/body/div/div/div/div[2]/div[2]/div[3]/div[1]/button")
Subject_Explore_Button$clickElement() 

# Wait for the page to finish loading.
Sys.sleep(5)

# Find the social science section (#RS05) and pull its inner HTML so the
# subject links can be counted offline with rvest.
social_science <- driver$findElement(using = "css selector", value = "#RS05")
subject_page <- social_science$getElementAttribute('innerHTML')[[1]]

# Create a blank dataframe for storage; columns get their types from the
# first row appended by bind_rows() below.
Subject_Rank <- data.frame(
  Institution_Subject = c(),
  Rank = c()
)

# Count the total subjects. This bounds the subsequent "for" loop.
subject_count <- read_html(subject_page) %>%
  html_nodes(".subject-list > div") %>%
  length()

# Scrape the rank of every social-science subject for each Ivy League
# university.
for (i in seq_len(subject_count)) {
  # Find the i-th subject link and record its label before clicking.
  subject_button <- driver$findElement(using = "css selector", value = paste0("#RS05 > div:nth-child(2) > div:nth-child(", i, ") > a:nth-child(1)"))
  subject_text <- subject_button$getElementText()
  # Click the button.
  subject_button$clickElement()
  Sys.sleep(2)
  # Find the search field on the subject ranking page.
  search_field2 <- driver$findElement(using = "xpath", value = "/html/body/div/div/div/div[2]/div/div[2]/div/div[1]/div/div[1]/input")
  
  # Search for each university and record its rank for this subject.
  for (j in seq_len(nrow(ivydata))) {
    search_field2$sendKeysToElement(list(ivydata$uni_name[j]))
    search_field2$sendKeysToElement(list(key = "enter"))
    # A scalar condition belongs in if/else rather than ifelse(), which
    # also avoids ifelse() silently taking the first element of a branch.
    result_count <- driver$findElement(using = "xpath", value = '/html/body/div/div/div/div[2]/div/div[2]/div/div[1]/div/div[1]/div[3]')$getElementText()
    if (result_count == "0 Institutions") {
      # The university is not ranked for this subject.
      rank <- NA
    } else {
      rank <- driver$findElement(using = 'class name', value = 'ranking')$getElementText()
    }
    newrow <- list(Institution_Subject = paste0(ivydata$uni_name[j], "_", subject_text),
                   Rank = rank[1])
    Subject_Rank <- bind_rows(Subject_Rank, newrow)
    
    # Clear the search field for the next query.
    search_field2$clickElement()
    search_field2$sendKeysToElement(list(key = "control", "A"))
    search_field2$sendKeysToElement(list(key = "delete"))
  }
  # Go back to the subject exploration page.
  driver$navigate("https://www.shanghairanking.com/rankings/gras/2023")
}

# Flatten the list-column of rank strings, keeping NA for unranked rows.
Subject_Rank <- unnest(Subject_Rank, cols = Rank, keep_empty = TRUE)

# Convert the rank strings to numbers; range ranks such as "51-75" become
# the mean of their endpoints. Replacing the column wholesale keeps it
# numeric - the original per-element assignment into a character column
# coerced each mean straight back to character. Base regmatches()/
# gregexpr() replace stringr::str_extract_all().
Subject_Rank$Rank <- vapply(
  Subject_Rank$Rank,
  function(r) {
    if (is.na(r)) return(NA_real_)  # unranked universities stay NA
    mean(as.numeric(regmatches(r, gregexpr("\\d+", r))[[1]]))
  },
  numeric(1),
  USE.NAMES = FALSE
)

# View the final table.
head(Subject_Rank)

# Write the subject-rank table into my relational database.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
dbWriteTable(con, "Subject_ranks", Subject_Rank, row.names = FALSE, overwrite = TRUE)
# Re-connect and check table existence and dimensionality.
# NOTE(review): as above, the first connection handle is never explicitly
# disconnected - consider a single connection plus dbDisconnect().
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
checkTable("my_database.sqlite", "Subject_ranks")
# Load packages for the ProPublica Nonprofit Explorer API.
library(httr)
library(jsonlite)

# Create a blank dataframe for storage.
financial_data <- data.frame()

# Gather total revenue and total assets from the ProPublica nonprofit API
# for Ivy League members only, looked up by EIN.
for (i in seq_len(nrow(ivydata))) {
  ein <- ivydata$ein[i]
  uni <- ivydata$uni_name[i]
  url <- paste0("https://projects.propublica.org/nonprofits/api/v2/organizations/", ein, ".json")
  data <- fromJSON(url)$filings_with_data
  # seq_len(NROW(...)) guards the inner loop: if an organization has no
  # filings, the old 1:nrow(data) would still iterate (1:0 counts down)
  # or fail outright when the field is NULL.
  for (j in seq_len(NROW(data))) {
    newrow <- list(Institution_Year = paste0(uni, "_", data$tax_prd_yr[j]),
                   Total_Revenue = data$totrevenue[j],
                   Total_Assets = data$totassetsend[j])
    financial_data <- rbind(financial_data, newrow)
  }
}
# View the output.
head(financial_data)

# Write the financial table into my relational database.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
dbWriteTable(con, "Financial_data", financial_data, row.names = FALSE, overwrite = TRUE)
# Re-connect and check table existence and dimensionality.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
checkTable("my_database.sqlite", "Financial_data")
# Load the package.
library(tidycensus)

# Import my private Census API key from a local .env-style file.
readRenviron("exercise4b.env")
apikey <- Sys.getenv("KEY")
# NOTE(review): apikey is read but never passed to get_acs(); tidycensus
# looks for CENSUS_API_KEY in the environment - confirm the key from
# "KEY" is actually being picked up.

# Retrieve county-level median household income (ACS variable B19013_001)
# for 2015.
us_county_income_2015 <- get_acs(geography = "county",
                     variables = "B19013_001",
                           year = 2015)
print(us_county_income_2015)

# Retrieve the same variable for 2020.
us_county_income_2020 <- get_acs(geography = "county",
                     variables = "B19013_001",
                           year = 2020)
print(us_county_income_2020)
# Create a blank dataframe to store the outcome.
new_ivydata <- data.frame()

# Retrieve the county name of each Ivy university and its local estimated
# median household income for 2015 and 2020.
for (i in seq_len(nrow(ivydata))) {
  institution <- ivydata$uni_name[i]
  ins_yr2015 <- paste0(institution, "_", 2015)
  ins_yr2020 <- paste0(institution, "_", 2020)
  county <- ivydata$County[i]
  # match() returns the first matching row or NA when the county name is
  # absent, so a missing county yields an NA estimate instead of the
  # zero-length vector which(...) would produce (and which would break
  # rbind() below).
  rownumber2015 <- match(county, us_county_income_2015$NAME)
  rownumber2020 <- match(county, us_county_income_2020$NAME)
  estimate2015 <- us_county_income_2015$estimate[rownumber2015]
  estimate2020 <- us_county_income_2020$estimate[rownumber2020]
  newrow2015 <- list(Institution_Year = ins_yr2015,
                     County = county,
                     Estimated_median_household_income = estimate2015)
  newrow2020 <- list(Institution_Year = ins_yr2020,
                     County = county,
                     Estimated_median_household_income = estimate2020)
  new_ivydata <- rbind(new_ivydata, newrow2015, newrow2020)
}
head(new_ivydata)

# Write the structured table into my database.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path) # Connect with local database.
dbWriteTable(con, "Ivy_League_with_Economic_Data", new_ivydata, row.names = FALSE, overwrite = TRUE)
# Re-connect and check for the existence and correct dimensionality.
con <- dbConnect(RSQLite::SQLite(), dbname = db_path)
checkTable("my_database.sqlite", "Ivy_League_with_Economic_Data")
# Connect with my relational database.
mydb <- dbConnect(RSQLite::SQLite(), db_path)

# Build one analysis row per Ivy League institution via SQL:
#   - Avg_ranking: mean ARWU rank across the scraped years, recovered by
#     stripping the trailing "_YYYY" from the Institution_Year key.
#   - Economics/Political Science/Sociology ranks pulled from the
#     Subject_ranks table (joined by the text before the first "_").
#   - Endowment per student and mean revenue per student, with filings
#     restricted to tax years 2015-2020 by the WHERE clause.
#   - Mean county median household income from the Census table.
# NOTE(review): the 2015-2020 filter on fd.Institution_Year interacts
# with the other LEFT JOINs through the GROUP BY - verify each AVG() is
# taken over the intended row multiplicity.
analysis_table <- dbGetQuery(mydb, "
SELECT 
    e.Institution AS Institution,
    ROUND(AVG(ar.Rank)) AS Avg_ranking,
    avg_sub.Economics_Rank,
    avg_sub.Political_Science_Rank,
    avg_sub.Sociology_Rank,
    e.Endowment / e.Total_Students AS Endowment_per_Student,
    AVG(fd.Total_Revenue * 1.0 / e.Total_Students) AS Avg_Revenue_Per_Student,
    ROUND(AVG(iwe.Estimated_median_household_income)) AS Avg_median_household_income
FROM 
    Exercise2_Universities_Data AS e
LEFT JOIN 
    ARWU_ranks AS ar ON e.Institution = SUBSTR(ar.Institution_Year, 1, LENGTH(ar.Institution_Year) - 5)
LEFT JOIN 
    Financial_data AS fd ON SUBSTR(fd.Institution_Year, 1, INSTR(fd.Institution_Year, '_') - 1) = e.Institution
LEFT JOIN 
    Ivy_League_with_Economic_Data AS iwe ON e.Institution = SUBSTR(iwe.Institution_Year, 1, LENGTH(iwe.Institution_Year) - 5)
LEFT JOIN 
    (
        SELECT 
            t2.uni_name AS Institution,
            CAST(e.Rank AS FLOAT) AS Economics_Rank,
            CAST(p.Rank AS FLOAT) AS Political_Science_Rank,
            CAST(s.Rank AS FLOAT) AS Sociology_Rank
        FROM 
            Ivy_League AS t2
        LEFT JOIN 
            (SELECT Institution_Subject, Rank FROM Subject_ranks WHERE Institution_Subject LIKE '%Economics%') AS e ON t2.uni_name = SUBSTR(e.Institution_Subject, 1, INSTR(e.Institution_Subject, '_') - 1)
        LEFT JOIN 
            (SELECT Institution_Subject, Rank FROM Subject_ranks WHERE Institution_Subject LIKE '%Political Science%') AS p ON t2.uni_name = SUBSTR(p.Institution_Subject, 1, INSTR(p.Institution_Subject, '_') - 1)
        LEFT JOIN 
            (SELECT Institution_Subject, Rank FROM Subject_ranks WHERE Institution_Subject LIKE '%Sociology%') AS s ON t2.uni_name = SUBSTR(s.Institution_Subject, 1, INSTR(s.Institution_Subject, '_') - 1)
    ) AS avg_sub ON e.Institution = avg_sub.Institution
WHERE 
    e.IvyLeague_Member = 'Yes'
    AND CAST(REPLACE(SUBSTR(fd.Institution_Year, INSTR(fd.Institution_Year, '_') + 1), '_', '') AS INTEGER) BETWEEN 2015 AND 2020
GROUP BY 
    e.Institution;

           ")
head(analysis_table)

# Load plotting packages.
library(ggplot2)
library(dplyr)

# Plot F1: overall average university ranking against the average
# Economics/Political Science/Sociology subject rankings.
plot1 <- analysis_table %>%
  ggplot()+
  
  # Lines and points for the overall and per-subject rankings. Bare
  # column names are used inside aes(); the original analysis_table$...
  # references bypass the layer data and break if the plotted data is
  # ever filtered, reordered, or faceted.
  geom_line(aes(Institution, Avg_ranking, group = "", color = "Average Rank"), size = 0.8)+ 
  geom_point(aes(x= Institution, y = Avg_ranking),size = 2.5, color = "orange", alpha = 0.6)+
  geom_line(aes(Institution, Economics_Rank, group = "", color = "Economics Rank"), size = 0.8)+
  geom_point(aes(Institution, Economics_Rank), size = 2.5, color = "lightblue", alpha = 0.8)+
  geom_line(aes(Institution, Political_Science_Rank, group = "",color = "Political Science Rank"), size = 0.8)+
  geom_point(aes(Institution, Political_Science_Rank), size = 2.5, color = "pink", alpha = 0.8)+
  geom_line(aes(Institution, Sociology_Rank, group = "",color = "Sociology Rank"), size = 0.8, alpha = 0.6)+
  geom_point(aes(Institution, Sociology_Rank), size = 2.5, color = "green", alpha = 0.6)+
  
  # Reverse the y axis so a better (numerically smaller) rank sits higher.
  scale_y_reverse()+
  
  # Set the theme and legend of the plot.
  theme_bw()+
  scale_color_manual(values = c("Average Rank" = "orange", "Economics Rank" = "lightblue", "Political Science Rank" = "pink", "Sociology Rank" = "green"))+
  labs(color = "")+
  labs(x = "Institution", y = "Rank") +
  ggtitle("F1. Relationships between average university ranking and average subject ranking")+
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        axis.title.x = element_blank(),
        panel.border = element_blank(),
        plot.title = element_text(size = 11))

# View the plot.
plot1

# Plot F2: average university ranking against endowment per student
# (bars = endowment, line = rank rescaled onto the endowment axis).

# Scale factor mapping ranks onto the endowment axis; hoisted out of the
# aes() calls so it is computed once, and 4,300,000 is the manual offset
# that flips the line so a better rank plots higher.
rank_to_endowment <- max(analysis_table$Endowment_per_Student) / max(analysis_table$Avg_ranking)

plot2 <- analysis_table %>%
  ggplot() +
  aes(x = Institution)+
  
  # Bars indicating endowment per student of each university.
  geom_bar(aes(y = Endowment_per_Student, fill = "Endowment Per Student"), stat = "identity", alpha = 0.8, width = 0.7) +
  
  # Line and points showing each university's (rescaled) average ranking.
  # Bare column names replace the original analysis_table$... references,
  # which bypass the layer data inside aes().
  geom_line(aes(y = 4300000 - Avg_ranking * rank_to_endowment, group = "", color = "Average Rank"), size = 0.8) +
  geom_point(aes(y = 4300000 - Avg_ranking * rank_to_endowment, color = "Average Rank"), size = 3, alpha = 0.6) +
  
  # Label each point with the rank itself.
  geom_text(aes(y = 4300000 - Avg_ranking * rank_to_endowment, label = Avg_ranking), family = "Arial", size = 3, vjust = 1.5)+
  
  # Label each bar with the endowment in millions.
  geom_text(aes(Institution, Endowment_per_Student, label = paste(round(Endowment_per_Student/10^6, 2), "million")), family = "Arial", size = 2.5, vjust = -0.5)+
  
  # Set up the legends.
  guides(fill = guide_legend(title = "", override.aes = list(color = NULL)),
         color = guide_legend(title = "", override.aes = list(fill = "orange", linetype = 1))) +
  scale_fill_manual(values = "skyblue") +
  
  # Set up the theme.
  theme_bw() +
  expand_limits(y = 0) +
  ggtitle("F2. Relationships between average university ranking and endowment per student") +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.x = element_blank(),
    panel.border = element_blank(),
    plot.title = element_text(size = 11)
  )

# View the plot.
plot2

# Plot F3: endowment per student vs. the median household income of each
# institution's home county, both in USD on a shared axis.
plot3 <- ggplot(analysis_table) +
  
  # One line per series: county median income and per-student endowment.
  geom_line(aes(Institution, Avg_median_household_income, color = "average median household income", group = ""), size = 0.8, alpha = 0.6) + 
  geom_line(aes(Institution, Endowment_per_Student, group = "", color = "average endowment per student"), size = 0.8, alpha = 0.6) +
  
  # Points make the individual institutions easier to read off.
  geom_point(aes(Institution, Endowment_per_Student), size = 2.5, color = "orange", alpha = 0.6) +
  geom_point(aes(Institution, Avg_median_household_income), size = 2.5, color = "lightblue", alpha = 0.6) +
  
  # Value labels above each point (endowment shown in millions).
  geom_text(aes(Institution, Endowment_per_Student, label = paste(round(Endowment_per_Student/10^6, 2), "million")), family = "Arial", size = 2.5, vjust = -0.5) +
  geom_text(aes(Institution, Avg_median_household_income, label = Avg_median_household_income), family = "Arial", size = 2.5, vjust = -0.5) +
  
  # Legend and axis labels.
  labs(color = "", 
       y = "USD") +
  
  # Theme.
  theme_light() +
  ggtitle("F3. Relationships between average endowment per student and average median household income") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        axis.title.x = element_blank(),
        panel.border = element_blank(),
        plot.title = element_text(size = 11))

# Render the plot.
plot3

# Plot F4: average revenue per student vs. the median household income of
# the county where each institution is located.
plot4 <- ggplot(analysis_table) +
  
  # Revenue-per-student series: line, value labels, then points.
  geom_line(aes(Institution, Avg_Revenue_Per_Student, group = "", color = "average revenue per student"), size = 0.8) +
  geom_text(aes(Institution, Avg_Revenue_Per_Student, label = round(Avg_Revenue_Per_Student)), family = "Arial", size = 2.5, vjust = -0.5) +
  geom_point(aes(Institution, Avg_Revenue_Per_Student), size = 2.5, color = "skyblue", alpha = 0.8) +
  
  # County median-income series, same layer ordering.
  geom_line(aes(Institution, Avg_median_household_income, group = "", color = "average median household income")) +
  geom_text(aes(Institution, Avg_median_household_income, label = Avg_median_household_income), family = "Arial", size = 2.5, vjust = -0.5) +
  geom_point(aes(Institution, Avg_median_household_income), size = 2.5, color = "orange", alpha = 0.6) +
  
  # Legend and axis labels.
  labs(color = "", 
       y = "USD") +
  
  # Theme.
  theme_light() +
  ggtitle("F4. Relationships between average revenue per student and average median household income") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        panel.border = element_blank(),
        axis.title.x = element_blank(),
        plot.title = element_text(size = 11)
        )

# Render the plot.
plot4

# Load packages for mapping.
library(tmap)
library(tigris)
library(sf)
library(sp)
library(stringr)

# Pull an analysis table from the relational database. The SQL parses the
# "DD.DDDD°N DD.DDDD°W"-style coordinate strings: latitude is the text
# before the first degree sign, longitude the text between 'N' and the
# next degree sign, negated unconditionally.
# NOTE(review): the unconditional negation assumes every campus lies in
# the western hemisphere (true for US institutions) - confirm the stored
# coordinate format always ends in W.
analysis_data2 <- dbGetQuery(mydb, " SELECT 
    Institution, 
    CAST(SUBSTR(Coordinates, 1, INSTR(Coordinates, '°') - 1) AS REAL) AS Latitude,
    -CAST(SUBSTR(SUBSTR(Coordinates, INSTR(Coordinates, 'N') + 1), 1, INSTR(SUBSTR(Coordinates, INSTR(Coordinates, 'N') + 1), '°') - 1) AS REAL) AS Longitude,
    Control AS Status,
    IvyLeague_Member
  FROM 
    Exercise2_Universities_Data
  WHERE 
    Coordinates NOT LIKE '%NA%'") 

# The output table includes the institution's name, numeric coordinates,
# control status, and Ivy League membership.
head(analysis_data2)
# Convert the dataframe into an sf object (WGS84 lon/lat, EPSG:4326).
coordinates <- st_as_sf(analysis_data2, coords = c("Longitude", "Latitude"), crs = 4326)
# Get a shapefile of US state boundaries by calling tigris::states().
us_shapefile <- states()
# Switch the map mode to interactive "view".
tmap_mode("view")

# Generate a map displaying the distribution of US research universities,
# clipped to the contiguous-US bounding box.
tm_shape(us_shapefile, bbox = c(left = -130, bottom = 24, right = -65, top = 50))+
  tm_borders(col = "lightgrey", lwd = 1, alpha = 0.7)+
  tm_text("NAME", size = 0.7)+
  tm_shape(coordinates[coordinates$IvyLeague_Member == "No", ]) + # Non-Ivy institutions, colored by control status.
  tm_bubbles(col = "Status", size = 0.2)+ 
  tm_shape(coordinates[coordinates$IvyLeague_Member == "Yes", ]) + # Ivy League members, highlighted in orange.
  tm_bubbles(col = "orange", size = 0.3, title.col = "Ivy League Member")+
  tm_layout(legend.position = c("right", "bottom"), title = "Figure5. Mapping of Research Institutions in the United States")

# this chunk generates the complete code appendix. 
# eval=FALSE tells R not to run (``evaluate'') the code here (it was already run before).